library(tidyverse)
library(janitor)
library(mosaic)
library(ggfortify)
library(GGally)
library(modelr)
avocado <- read_csv("data/avocado.csv") %>% clean_names()
New names:Rows: 18249 Columns: 14── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): type, region
dbl (11): ...1, AveragePrice, Total Volume, 4046, 4225, 4770, Total Bags, Small Bags, Large Bags, XLarge Bags, year
date (1): Date
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
avocado_clean <- avocado %>%
select(-x1) %>%
mutate(month = month(date)) %>%
select(-date) %>%
rename_with(~ sub("^x4", "code_4", .x), starts_with("x")) %>%
mutate(type = as_factor(type),
year = as_factor(year),
region = as_factor(region),
month = as_factor(month))
alias(lm(average_price ~ ., data = avocado_clean))
Model :
average_price ~ total_volume + code_4046 + code_4225 + code_4770 +
total_bags + small_bags + large_bags + x_large_bags + type +
year + region + month
all_model2 <- lm(log(average_price) ~ ., data = avocado_new)
summary(all_model2)
Call:
lm(formula = log(average_price) ~ ., data = avocado_new)
Residuals:
Min 1Q Median 3Q Max
-1.15376 -0.09774 0.01044 0.11220 0.66405
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.534e-01 1.032e-02 14.864 < 2e-16 ***
total_volume -4.053e-05 2.631e-05 -1.541 0.123450
code_4046 4.050e-05 2.631e-05 1.540 0.123686
code_4225 4.054e-05 2.631e-05 1.541 0.123348
code_4770 4.049e-05 2.631e-05 1.539 0.123738
code_other NA NA NA NA
total_bags -1.898e-02 1.969e-02 -0.964 0.335095
small_bags 1.902e-02 1.969e-02 0.966 0.334067
large_bags 1.902e-02 1.969e-02 0.966 0.334069
x_large_bags 1.902e-02 1.969e-02 0.966 0.334043
typeorganic 3.527e-01 2.667e-03 132.207 < 2e-16 ***
year2016 -3.464e-02 3.274e-03 -10.581 < 2e-16 ***
year2017 9.043e-02 3.280e-03 27.573 < 2e-16 ***
year2018 7.470e-02 5.801e-03 12.878 < 2e-16 ***
month2 -3.618e-02 5.632e-03 -6.424 1.36e-10 ***
month3 1.674e-02 5.549e-03 3.016 0.002563 **
month4 5.012e-02 6.089e-03 8.231 < 2e-16 ***
month5 3.767e-02 5.967e-03 6.314 2.79e-10 ***
month6 7.840e-02 6.248e-03 12.547 < 2e-16 ***
month7 1.184e-01 5.995e-03 19.755 < 2e-16 ***
month8 1.557e-01 6.096e-03 25.542 < 2e-16 ***
month9 1.899e-01 6.224e-03 30.515 < 2e-16 ***
month10 1.992e-01 5.962e-03 33.413 < 2e-16 ***
month11 1.211e-01 6.090e-03 19.884 < 2e-16 ***
month12 2.207e-02 6.087e-03 3.626 0.000289 ***
avocado_per_bag -2.145e-05 1.116e-05 -1.921 0.054704 .
region_Atlanta -1.738e-01 1.311e-02 -13.260 < 2e-16 ***
region_BaltimoreWashington -2.035e-02 1.311e-02 -1.552 0.120622
region_Boise -1.808e-01 1.309e-02 -13.813 < 2e-16 ***
region_Boston -2.671e-02 1.311e-02 -2.037 0.041629 *
region_BuffaloRochester -2.525e-02 1.309e-02 -1.929 0.053739 .
region_California -1.323e-01 1.339e-02 -9.877 < 2e-16 ***
region_Charlotte 1.448e-02 1.309e-02 1.106 0.268841
region_Chicago -9.329e-03 1.316e-02 -0.709 0.478446
region_CincinnatiDayton -2.770e-01 1.310e-02 -21.145 < 2e-16 ***
region_Columbus -2.318e-01 1.309e-02 -17.709 < 2e-16 ***
region_DallasFtWorth -3.754e-01 1.312e-02 -28.623 < 2e-16 ***
region_Denver -2.549e-01 1.317e-02 -19.354 < 2e-16 ***
region_Detroit -2.169e-01 1.312e-02 -16.539 < 2e-16 ***
region_GrandRapids -5.072e-02 1.309e-02 -3.874 0.000108 ***
region_GreatLakes -1.627e-01 1.360e-02 -11.965 < 2e-16 ***
region_HarrisburgScranton -3.497e-02 1.309e-02 -2.672 0.007549 **
region_HartfordSpringfield 1.352e-01 1.309e-02 10.328 < 2e-16 ***
region_Houston -4.123e-01 1.311e-02 -31.446 < 2e-16 ***
region_Indianapolis -1.765e-01 1.309e-02 -13.481 < 2e-16 ***
region_Jacksonville -5.157e-02 1.309e-02 -3.939 8.21e-05 ***
region_LasVegas -1.565e-01 1.309e-02 -11.954 < 2e-16 ***
region_LosAngeles -2.778e-01 1.330e-02 -20.890 < 2e-16 ***
region_Louisville -2.047e-01 1.309e-02 -15.638 < 2e-16 ***
region_MiamiFtLauderdale -9.092e-02 1.311e-02 -6.937 4.14e-12 ***
region_Midsouth -1.066e-01 1.322e-02 -8.065 7.79e-16 ***
region_Nashville -2.692e-01 1.309e-02 -20.560 < 2e-16 ***
region_NewOrleansMobile -1.875e-01 1.309e-02 -14.324 < 2e-16 ***
region_NewYork 8.614e-02 1.321e-02 6.520 7.23e-11 ***
region_Northeast 5.965e-03 1.416e-02 0.421 0.673563
region_NorthernNewEngland -5.863e-02 1.310e-02 -4.477 7.63e-06 ***
region_Orlando -4.859e-02 1.310e-02 -3.710 0.000208 ***
region_Philadelphia 4.251e-02 1.310e-02 3.246 0.001172 **
region_PhoenixTucson -3.236e-01 1.313e-02 -24.643 < 2e-16 ***
region_Pittsburgh -1.243e-01 1.309e-02 -9.496 < 2e-16 ***
region_Plains -8.722e-02 1.313e-02 -6.645 3.12e-11 ***
region_Portland -2.048e-01 1.311e-02 -15.619 < 2e-16 ***
region_RaleighGreensboro -1.977e-02 1.310e-02 -1.510 0.131129
region_RichmondNorfolk -1.898e-01 1.309e-02 -14.499 < 2e-16 ***
region_Roanoke -2.286e-01 1.309e-02 -17.465 < 2e-16 ***
region_Sacramento 1.956e-02 1.309e-02 1.494 0.135129
region_SanDiego -1.429e-01 1.309e-02 -10.912 < 2e-16 ***
region_SanFrancisco 1.177e-01 1.336e-02 8.806 < 2e-16 ***
region_Seattle -1.092e-01 1.312e-02 -8.319 < 2e-16 ***
region_SouthCarolina -1.136e-01 1.309e-02 -8.680 < 2e-16 ***
region_SouthCentral -3.351e-01 1.356e-02 -24.723 < 2e-16 ***
region_Southeast -9.374e-02 1.346e-02 -6.966 3.37e-12 ***
region_Spokane -1.097e-01 1.314e-02 -8.348 < 2e-16 ***
region_StLouis -1.102e-01 1.310e-02 -8.411 < 2e-16 ***
region_Syracuse -2.035e-02 1.309e-02 -1.555 0.119960
region_Tampa -1.091e-01 1.310e-02 -8.331 < 2e-16 ***
region_TotalUS -1.135e-01 1.633e-02 -6.953 3.70e-12 ***
region_West -1.952e-01 1.365e-02 -14.299 < 2e-16 ***
region_WestTexNewMexico -2.596e-01 1.315e-02 -19.743 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1701 on 18156 degrees of freedom
(15 observations deleted due to missingness)
Multiple R-squared: 0.6563, Adjusted R-squared: 0.6548
F-statistic: 450.2 on 77 and 18156 DF, p-value: < 2.2e-16
model1 <- lm(average_price ~ code_4046, data = avocado_trim)
summary(model1)
Call:
lm(formula = average_price ~ code_4046, data = avocado_trim)
Residuals:
Min 1Q Median 3Q Max
-0.98539 -0.29842 -0.03531 0.25459 1.82475
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.425e+00 2.993e-03 476.29 <2e-16 ***
code_4046 -6.631e-08 2.305e-09 -28.77 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3939 on 18247 degrees of freedom
Multiple R-squared: 0.0434, Adjusted R-squared: 0.04334
F-statistic: 827.8 on 1 and 18247 DF, p-value: < 2.2e-16
R^2 - model explains 4.3% of the variance in average price
interpretation
A 1 unit increase in total avocados sold with the code 4046 is associated with a -$6.63 drop in price on average
graph 1 - sample population data is not independent graph 2 - data is not normally distributed graph 3 - line shows graident - there is heteroskedasticity in the data set graph 4 - cannot see cook’s lines - which is fine
We can log transform average price to get a more normal distribution of the data
model1 <- lm(log(average_price) ~ code_4046, data = avocado_new)
summary(model1)
Call:
lm(formula = log(average_price) ~ code_4046, data = avocado_new)
Residuals:
Min 1Q Median 3Q Max
-1.13550 -0.19950 0.01485 0.20426 0.86424
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.145e-01 2.146e-03 146.57 <2e-16 ***
code_4046 -5.105e-08 1.653e-09 -30.89 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2824 on 18247 degrees of freedom
Multiple R-squared: 0.04969, Adjusted R-squared: 0.04964
F-statistic: 954.1 on 1 and 18247 DF, p-value: < 2.2e-16
better R^2 (5% of variance explained)
graph 1 - unchanged graph 2 - more normal distribution - bell shaped curve graph 3 - unchanged graph 4 - unchanged
residuals <- avocado_new %>%
add_residuals(model1) %>%
select(-c(average_price, code_4046))
# seperate into numeric and non-numeric
avocado_resid_numeric <- residuals %>%
select_if(is.numeric)
avocado_resid_nonnumeric <- residuals %>%
select_if(function(x) !is.numeric(x))
avocado_resid_nonnumeric$average_price <- avocado_trim$average_price
ggpairs(avocado_resid_numeric)
ggpairs(avocado_resid_nonnumeric)
residuals suggest that avocado_per_bag is the highest correlated numeric factor with average price.
However, the non-numeric graph indicates that the type is highly corrleated given the distribution of data and the boxplot graph
will include type next
model2 <- lm(log(average_price) ~ code_4046 + type, data = avocado_new)
summary(model2)
Call:
lm(formula = log(average_price) ~ code_4046 + type, data = avocado_new)
Residuals:
Min 1Q Median 3Q Max
-1.29949 -0.13465 0.00396 0.14883 0.70019
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.325e-01 2.499e-03 53.02 <2e-16 ***
code_4046 -2.016e-08 1.361e-09 -14.81 <2e-16 ***
typeorganic 3.460e-01 3.444e-03 100.47 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2266 on 18246 degrees of freedom
Multiple R-squared: 0.3882, Adjusted R-squared: 0.3881
F-statistic: 5789 on 2 and 18246 DF, p-value: < 2.2e-16
we can see that type has dramatically increased the fit of our model, with the R^2 explaining 38.8% of the variance in average price
result interpretation:
An increase in typeorganic by 1 unit is associated with a change in average_price by 3.4%, holding all other factors constant.
graph 1 - population seems to be indepedent - two distinct populatons likely due to their being two distinct types graph 2 - population distribution seems less normal - may have to log transform graph 3 - less gradient meaning the conditional variance of residauls is constant –> homoskedasticity rather than hetero…
anova(model2, model1)
Analysis of Variance Table
Model 1: log(average_price) ~ code_4046 + type
Model 2: log(average_price) ~ code_4046
Res.Df RSS Df Sum of Sq F Pr(>F)
1 18246 936.88
2 18247 1455.24 -1 -518.36 10095 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
statiscally significant (already knew that) looks good to use (obviously)
residuals <- avocado_new %>%
add_residuals(model2) %>%
select(-c(average_price, code_4046, type))
# seperate into numeric and non-numeric
avocado_resid_numeric <- residuals %>%
select_if(is.numeric)
avocado_resid_nonnumeric <- residuals %>%
select_if(function(x) !is.numeric(x))
avocado_resid_nonnumeric$average_price <- avocado_trim$average_price
ggpairs(avocado_resid_numeric)
ggpairs(avocado_resid_nonnumeric)
highest correlated numeric is large_bag but doesnt seem to be highly correlated at all (0.064).
Month or year may be a good variable to use indicated by the box plots
I would like to try and use the regionv variable which may take some coding to rearrange so that it can be useable. might have to change to binary columns.
model3 <- lm(log(average_price) ~ code_4046 + type + month, data = avocado_new)
summary(model3)
Call:
lm(formula = log(average_price) ~ code_4046 + type + month, data = avocado_new)
Residuals:
Min 1Q Median 3Q Max
-1.25035 -0.13044 0.00733 0.14929 0.69870
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.656e-02 5.198e-03 12.806 < 2e-16 ***
code_4046 -1.883e-08 1.296e-09 -14.530 < 2e-16 ***
typeorganic 3.468e-01 3.276e-03 105.857 < 2e-16 ***
month2 -3.540e-02 7.127e-03 -4.967 6.86e-07 ***
month3 1.604e-02 7.015e-03 2.287 0.02221 *
month4 4.172e-02 7.549e-03 5.527 3.31e-08 ***
month5 1.960e-02 7.391e-03 2.652 0.00801 **
month6 6.758e-02 7.733e-03 8.739 < 2e-16 ***
month7 1.107e-01 7.391e-03 14.977 < 2e-16 ***
month8 1.424e-01 7.549e-03 18.863 < 2e-16 ***
month9 1.765e-01 7.730e-03 22.830 < 2e-16 ***
month10 1.865e-01 7.392e-03 25.230 < 2e-16 ***
month11 1.050e-01 7.550e-03 13.902 < 2e-16 ***
month12 1.428e-02 7.551e-03 1.891 0.05861 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2155 on 18235 degrees of freedom
Multiple R-squared: 0.4468, Adjusted R-squared: 0.4464
F-statistic: 1133 on 13 and 18235 DF, p-value: < 2.2e-16
should I use this variable?
anova(model3, model2)
Analysis of Variance Table
Model 1: log(average_price) ~ code_4046 + type + month
Model 2: log(average_price) ~ code_4046 + type
Res.Df RSS Df Sum of Sq F Pr(>F)
1 18235 847.18
2 18246 936.88 -11 -89.705 175.53 < 2.2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Yes.
graph 1 - looks good for distribution, some very distinct population areas though missing some key bits of data from pop hence why R^2 is 45%
graph 2 - still close to a normal distribution
graph 3 - looks good, homoskedastic!
residuals <- avocado_new %>%
add_residuals(model2) %>%
select(-c(average_price, code_4046, type, month))
# seperate into numeric and non-numeric
avocado_resid_numeric <- residuals %>%
select_if(is.numeric)
avocado_resid_nonnumeric <- residuals %>%
select_if(function(x) !is.numeric(x))
avocado_resid_nonnumeric$average_price <- avocado_trim$average_price
ggpairs(avocado_resid_numeric)
ggpairs(avocado_resid_nonnumeric)
model4 <- lm(log(average_price) ~ code_4046 + type + month, data = avocado_new)
summary(model4)
Call:
lm(formula = log(average_price) ~ code_4046 + type + month, data = avocado_new)
Residuals:
Min 1Q Median 3Q Max
-1.25035 -0.13044 0.00733 0.14929 0.69870
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.656e-02 5.198e-03 12.806 < 2e-16 ***
code_4046 -1.883e-08 1.296e-09 -14.530 < 2e-16 ***
typeorganic 3.468e-01 3.276e-03 105.857 < 2e-16 ***
month2 -3.540e-02 7.127e-03 -4.967 6.86e-07 ***
month3 1.604e-02 7.015e-03 2.287 0.02221 *
month4 4.172e-02 7.549e-03 5.527 3.31e-08 ***
month5 1.960e-02 7.391e-03 2.652 0.00801 **
month6 6.758e-02 7.733e-03 8.739 < 2e-16 ***
month7 1.107e-01 7.391e-03 14.977 < 2e-16 ***
month8 1.424e-01 7.549e-03 18.863 < 2e-16 ***
month9 1.765e-01 7.730e-03 22.830 < 2e-16 ***
month10 1.865e-01 7.392e-03 25.230 < 2e-16 ***
month11 1.050e-01 7.550e-03 13.902 < 2e-16 ***
month12 1.428e-02 7.551e-03 1.891 0.05861 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2155 on 18235 degrees of freedom
Multiple R-squared: 0.4468, Adjusted R-squared: 0.4464
F-statistic: 1133 on 13 and 18235 DF, p-value: < 2.2e-16
this model doesnt really explain anymore so will use model 3 going forward
code_4046:type, code_4046:month, type:month
model5a <- lm(log(average_price) ~ code_4046 + type + month + code_4046:type, data = avocado_new)
summary(model5a)
Call:
lm(formula = log(average_price) ~ code_4046 + type + month +
code_4046:type, data = avocado_new)
Residuals:
Min 1Q Median 3Q Max
-1.25712 -0.13120 0.00856 0.14927 0.69271
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.643e-02 5.186e-03 12.809 < 2e-16 ***
code_4046 -1.868e-08 1.293e-09 -14.452 < 2e-16 ***
typeorganic 3.532e-01 3.345e-03 105.604 < 2e-16 ***
month2 -3.491e-02 7.111e-03 -4.910 9.20e-07 ***
month3 1.669e-02 6.999e-03 2.384 0.01714 *
month4 4.249e-02 7.533e-03 5.640 1.72e-08 ***
month5 2.066e-02 7.376e-03 2.801 0.00511 **
month6 6.818e-02 7.716e-03 8.836 < 2e-16 ***
month7 1.107e-01 7.374e-03 15.007 < 2e-16 ***
month8 1.424e-01 7.532e-03 18.903 < 2e-16 ***
month9 1.761e-01 7.713e-03 22.828 < 2e-16 ***
month10 1.857e-01 7.376e-03 25.173 < 2e-16 ***
month11 1.041e-01 7.533e-03 13.812 < 2e-16 ***
month12 1.326e-02 7.535e-03 1.760 0.07843 .
code_4046:typeorganic -8.719e-07 9.592e-08 -9.089 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2151 on 18234 degrees of freedom
Multiple R-squared: 0.4493, Adjusted R-squared: 0.4488
F-statistic: 1062 on 14 and 18234 DF, p-value: < 2.2e-16
check graph of interaction term
avocado_resid <- avocado_new %>%
add_residuals(model5a)
coplot(resid ~ code_4046 | type,
panel = function(x, y, ...){
points(x, y)
abline(lm(y ~ x), col = "blue")
},
data = avocado_resid, rows = 1)
code_4046:type, code_4046:month, type:month
model5b <- lm(log(average_price) ~ code_4046 + type + month + code_4046:month, data = avocado_new)
summary(model5b)
avocado_resid <- avocado_new %>%
add_residuals(model5b)
coplot(resid ~ code_4046 | month,
panel = function(x, y, ...){
points(x, y)
abline(lm(y ~ x), col = "blue")
},
data = avocado_resid, rows = 1)
model5c <- lm(log(average_price) ~ code_4046 + type + month + type:month, data = avocado_new)
summary(model5c)
avocado_resid <- avocado_new %>%
add_residuals(model5c)
coplot(resid ~ type | month,
panel = function(x, y, ...){
points(x, y)
abline(lm(y ~ x), col = "blue")
},
data = avocado_resid, rows = 1)
# all_model <- lm(log(average_price) ~ ., data = avocado_new)
#
# summary(all_model)
test / train
quiz
most likely overfit - dont need postcode or date of birth
Use the latter model - want a lower AIC score
first one as adjusted R squared is higher which accounts for adding new variables, penalises model for adding new ones that dont aid explanation of variance
No, RMSE goes down for test set so probably well fit
creates loads of samples of train test data and then averages the results of all ofall k-folds to say which is the best
could explain better
used as a final step to check the accuracy of model, the data in a validation set is neither used in the train or test set. Used to check if a model has been overfitted or not, want the model to generalise to multiple problems rather than one specific data set
start with all the independent variables in the model and deselect which ever variable lowers the R^2 the most
rather than removing or adding a variable for good, this method searches all possible combinations of variables to get the most efficient model